package com.finance.cooperate.common.utils;


import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Pre-processes raw SMS text into a single string of space-separated tokens.
 *
 * <p>Processing steps: URL-like prefixes are blanked out, the text is lower-cased,
 * every character outside {@code a-z} is replaced by a space, the result is split on
 * whitespace, and tokens found in the built-in stop-word list are dropped.
 *
 * <p>The class holds no mutable state; its lookup tables are built once at class
 * initialization and never modified afterwards.
 *
 * @author shen
 * @since 2023/8/4 16:32
 */
public final class SmsPreprocessV2Utils {

    /**
     * Matches {@code http://} or {@code https://} followed by a run of word characters,
     * '-', '.', or percent-encoded bytes. A '/' is not part of the run, so any path after
     * the host is left in the text and tokenized like ordinary words.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https?://(?:[-\\w.]|(?:%[\\da-fA-F]{2}))+");

    /** Matches any single character that is not an ASCII letter (digits, line breaks, punctuation, ...). */
    private static final Pattern NON_LETTER_PATTERN = Pattern.compile("[^a-zA-Z]");

    /** Read-only set of tokens that are removed from the output. */
    private static final Set<String> STOP_WORDS = Collections.unmodifiableSet(buildStopWords());

    /**
     * Returns the pre-processed SMS text as space-joined tokens with stop words removed.
     *
     * @param txt raw SMS text; {@code null} is treated as empty text
     * @return the remaining tokens joined by single spaces, or an empty string when
     *         nothing remains
     */
    public static String getTokensJoin(String txt) {

        if (txt == null) {
            return "";
        }

        StringBuilder result = new StringBuilder();
        // After preTxt the text contains only lower-case ASCII letters and spaces, so the
        // tokenizer's default whitespace delimiters are sufficient.
        StringTokenizer tokenizer = new StringTokenizer(preTxt(txt));

        while (tokenizer.hasMoreTokens()) {
            String word = tokenizer.nextToken();
            if (STOP_WORDS.contains(word)) {
                continue;
            }
            if (result.length() > 0) {
                result.append(' ');
            }
            result.append(word);
        }

        return result.toString();

    }


    /**
     * Normalizes the text so that only lower-case ASCII letters and spaces remain.
     *
     * @param txt raw text, must not be {@code null}
     * @return the text with URL-like prefixes blanked out, letters lower-cased, and every
     *         non-letter character (including digits and line breaks) replaced by a space
     */
    private static String preTxt(String txt) {

        // Blank out URL-like prefixes first so scheme and host do not become tokens.
        String cleaned = URL_PATTERN.matcher(txt).replaceAll(" ");
        // Locale.ROOT keeps the mapping locale-independent: with a Turkish default locale
        // "I" would lower-case to dotless "ı", which the next step would then erase.
        cleaned = cleaned.toLowerCase(Locale.ROOT);
        // Digits and line breaks are non-letters too, so this single pass turns them into
        // spaces together with all other non-letter characters.
        return NON_LETTER_PATTERN.matcher(cleaned).replaceAll(" ");

    }

    /**
     * Builds the stop-word set.
     *
     * <p>The list is kept verbatim. Entries that contain upper-case letters, punctuation,
     * apostrophes or whitespace (and the empty string) can never equal a token produced by
     * {@link #preTxt(String)} followed by whitespace tokenization; they are harmless.
     *
     * @return a new mutable set containing every stop word
     */
    private static Set<String> buildStopWords() {

        return new HashSet<>(
                Arrays.asList(
                        "", "pm", "nevertheless", "containing", "described", "we're", "xxx", "sure", "nothing", "awfully",
                        "concerning", "my", "works", "neither", "ah", "older", "gotten", "thinks", "causes", "non", "ain",
                        "possibly", "thorough", "]", "'m", "their", "everywhere", "until", "index", "two", "recently",
                        "still", "whod", "d", ":", "needn't", "importance", "in", "your", "much", "followed", "parts",
                        "(", "are", "needing", "<", "everything", "else", "ref", "anywhere", "consequently", "opens",
                        "ours", "c", "indeed", ",", "necessary", "am", "that", "herein", "others", "lately", "next",
                        "four", "part", "y", "little", "becoming", "then", "do", "omitted", "i'll", "sides", "definitely",
                        "somewhat", "auth", "therein", "trying", "new", "open", "way", "inner", "abst", "grouped", "normally",
                        "probably", "either", "xxxxxxxxx", "overall", "need", "thereof", "able", "youngest", "last", "both",
                        "elsewhere", "she", "is", "similarly", "might", "herself", "theres", "a", "cannot", "section",
                        "name", "faces", "wish", "can't", "wanting", "m", "seen", " ", "currently", "other", "hereupon",
                        "couldn't", "given", "specify", "often", "said", "thru", "showed", "thank", "highest", "what",
                        "generally", "s", "these", "toward", "what's", "within", "worked", "show", "latest", "seemed", "j",
                        "somethan", "yes", "accordance", "wherever", "former", "particular", "ran", "when", "while", "comes",
                        "thoughts", "value", "different", "taken", "cases", "haven't", "case", "took", "they'd", "noted",
                        "him", "beforehand", "outside", "na", "what'll", "id", "significant", "done", "some", "adopted",
                        "old", "via", "whatever", "try", "according", "nearly", "none", "away", "sometime", "opening", "youd",
                        "~", "getting", "greater", "km", "'re", "three", "it", "now", "everyone", "thanx", "latter", "behind",
                        "following", "and", "thereby", "im", "place", "tip", "keys", "slightly", "t's", "although", "could",
                        "&", "has", "sec", "across", "actually", "arise", "downing", "why", "number", "only", "self", "differ",
                        "making", "you're", "mainly", "clearly", "couldn", "_", "asks", "state", "w", "quickly", "soon", "oh",
                        "with", "thousand", "uucp", "an", "ts", "there's", "seeing", "home", "weren", "discuss", "sent", "know",
                        "sees", "+", "go", "stop", "hence", "states", "since", "who", "sometimes", "regarding", "shown",
                        "does", "inc", "later", "going", "began", "ltd", "hi", "results", "backing", "'ll", "insofar",
                        "under", "v", "how", "line", "shouldn't", "who'll", "if", "presenting", "use", "p", "newer", "be",
                        "felt", "you", "likely", "approximately", "asked", "resulting", "wasn't", "such", "}", "hid", "namely",
                        ")", "refs", "won't", "hadn't", "whereupon", "u", "indicated", "those", "consider", "com", "things",
                        "make", "noone", "\"", "six", "here's", "ly", "rather", "every", "though", "previously", "th", "ain't",
                        "plus", "furthermore", "on", "date", "etc", "really", "or", "man", "doing", "willing", "end", "=",
                        "allows", "moreover", "^", "%", "by", "information", "r", "that's", "showns", "see", "may", "doesn",
                        "k", "problems", "obviously", "announce", "ex", "must", "looks", "gets", "o", "hundred", "opened",
                        "pointed", "lest", "first", "yet", "nay", "/", "oldest", "this", "certain", "interests", "throug",
                        "eight", "look", "strongly", "itd", "were", "invention", "member", "hers", "e", "through", "\\", "didn",
                        "second", "today", "vs", "think", "viz", "welcome", "h", "interested", "before", "q", "inward", "meanwhile",
                        "even", "widely", "successfully", "greetings", "can", "where", "you'll", "past", "towards", "taking",
                        "turned", "thence", "-", "got", "zz", "gave", "because", "ma", "zt", "please", "anybody", "was",
                        "tried", "very", "parting", "already", "ninety", "hither", "certainly", "itself", "anyone", "cause",
                        "words", "been", "maybe", "nos", "i'd", "point", "ways", "where's", "needn", "face", "describe",
                        "tries", "thered", "affecting", "owing", "shall", "wheres", "any", "respectively", "room", "believe",
                        "mightn't", "useful", "fix", "pages", "seems", "i'm", "good", "pp", "whereby", "ve", "so", "recent",
                        "cant", "similar", "his", "merely", "she's", "anyway", "heres", "follows", "everybody", "xxxxx",
                        "appreciate", "theirs", "smallest", "pointing", "ZZ", "early", "mg", "wasn", "regards", "whomever",
                        "kept", "especially", "theyd", "she'll", "formerly", "kind", "just", "backed", "beginnings",
                        "particularly", "specifying", "thoughh", "ignored", "poorly", "hopefully", "into", "says", "forth",
                        "full", "hereafter", "differently", "effect", "than", "would", "rooms", "research", "hes", "yours",
                        "provides", "beside", "example", "xxxx", "against", "unto", "ie", "wells", "himself", "therefore",
                        "whoever", "whose", "well", "gives", "presumably", "kb", "become", "take", "changes", "above", "won",
                        "downed", "but", "co", "exactly", "he", "nd", "members", "b", "had", "'", "ever", "downs", "fact",
                        "seven", "except", "them", "after", "shouldn", "upon", "per", "couldnt", "men", "problem", "clear",
                        "qv", "$", "placed", "f", "amongst", "they", "appropriate", "important", "immediately", "far", "wouldn",
                        "gone", "keep", "makes", "did", "@", "somebody", "tends", "higher", "we've", "mightn", "don", "corresponding",
                        "whither", "anything", "shan", "edu", "various", "knows", "who's", "uses", "without", "along", "adj",
                        "indicate", "thereto", "they'll", "about", "sensible", "there'll", "areas", "aren't", "throughout",
                        "me", "million", "serious", "say", "potentially", "*", "beginning", "down", "ones", "come", "however",
                        "interesting", "we'll", "from", "usefully", "one", "sorry", "novel", "ordering", "he's", "et", "ends",
                        "somewhere", "they're", "associated", "affects", "appear", "facts", "allow", "working", "predominantly",
                        "shes", "hasn", "due", "big", "something", "like", "work", "groups", "sup", "#", "each", "again", "www",
                        "best", "|", "reasonably", "we'd", "alone", "que", "obtained", ".", "largely", "not", "hed", "ZT", "year",
                        "want", "indicates", "less", "nine", "her", "afterwards", "least", "c's", "perhaps", "readily", "miss",
                        "wanted", "arent", "tell", "great", "needed", "unless", "parted", "finds", "newest", "out", "you've",
                        "near", "seeming", "a's", "unlike", "same", "nonetheless", "once", "longest", "found", "that've",
                        "longer", "lets", "ord", "resulted", "sufficiently", "right", "wherein", "present", "un", "thou",
                        "whenever", "enough", "kg", "yourself", "became", "onto", "cr", "begin", "usually", "specifically",
                        ";", "'s", "backs", "becomes", "despite", "for", "youre", "fifth", "seem", "smaller", "furthered",
                        "to", "order", "til", "haven", "points", "shows", "promptly", "get", "help", "ought", "significantly",
                        "z", "happens", "thoroughly", "[", "act", "furthering", "ask", "thereafter", "'ve", "aren", "long",
                        "mustn", "quite", "anymore", "wants", "obtain", "page", "at", "having", "whim", "almost", "don't",
                        "`", "mr", "put", "seriously", "considering", "primarily", "seconds", "up", "?", "hereby", "will",
                        "myself", "furthers", "'t", "goes", "someone", "unfortunately", "c'mon", "unlikely", "\n", "own",
                        "mustn't", "made", "selves", "yourselves", "too", "you'd", "truly", "all", "wouldn't", "it'll",
                        "looking", "they've", "otherwise", "whereafter", "grouping", "necessarily", "relatively",
                        "somehow", "further", "possible", "hardly", "anyways", "usefulness", "younger", "rd", "several",
                        "n", "sub", "beyond", "aside", "came", "which", "more", "related", "re", "ordered", "small", "liked",
                        "wed", "keeps", "presents", "thanks", "there've", "contain", "include", "wonder", "ended", "entirely",
                        "needs", "themselves", "isn't", "general", "among", "its", "went", "it's", "inasmuch", "mrs", "side",
                        "knew", "whence", "!", "giving", "begins", "ll", "ff", "vol", "evenly", "ca", "whether", "howbeit",
                        "using", "most", "between", "large", "contains", "apart", "i've", "biol", "nowhere", "interest",
                        "isn", "have", "ac", "instead", "secondly", "anyhow", "turning", "we", "whom", "course", "five",
                        "asking", "another", "mean", "it'd", "ml", "nor", "ending", "thing", "thereupon", "eighty", "whereas",
                        "g", "hasn't", "during", "available", "vols", "few", "greatest", "area", "suggest", "apparently",
                        "also", "whos", "fully", "used", "shed", "l", "latterly", "https", "accordingly", "nobody", "added",
                        "always", "presented", "x", "thats", "being", "eg", "high", "let's", "downwards", "goods", "specified",
                        "run", "{", "known", "ups", "shan't", "ourselves", "back", "immediate", "third", "the", "let",
                        "didn't", "off", "no", "affected", "ok", "therere", "twice", "as", "doesn't", "should", "world",
                        "thus", "dt", "ed", "there", "xx", "whole", "saying", "places", "young", "mostly", "meantime", ">",
                        "whats", "hadn", "briefly", "orders", "below", "better", "group", "thought", "turn", "never", "puts",
                        "hello", "over", "that'll", "theyre", "numbers", "proud", "i", "substantially", "et-al", "mug",
                        "besides", "of", "years", "turns", "n't", "here", "showing", "weren't", "zero", "t", "many",
                        "around", "should've", "find", "okay", "together", "our", "give", "us", "beings", "brief", "saw",
                        "means", "regardless"
                )
        );

    }

}
